import numpy as np
import pandas as pd
from jieba import lcut
# Load the pickled mapping and tabulate it as one row per key with its count.
voc1 = pd.read_pickle("hex_voc_3.pkl")
voc = pd.DataFrame({"voc": voc1.keys(), "count": voc1.values()})
voc = voc.assign(voc_len=voc["voc"].str.len())
# Rank by count, breaking ties by key length (both descending).
voc = voc.sort_values(by=["count", "voc_len"], ascending=False)

# 16 + 16**2 + 16**3 is the number of distinct hex strings of length 1, 2
# and 3.  NOTE(review): presumably one pseudo-count per such string, matching
# the hex-digit loops further down the script -- confirm.
extra_mass = 16 + 16 * 16 + 16 ** 3
smoothed_total = voc["count"].sum() + extra_mass
p = voc["count"] / smoothed_total
p1 = extra_mass / smoothed_total
# Print the two p*log(p) terms: the extra mass as a single bucket, and the
# sum over all tabulated entries.
print(p1 * np.log(p1), np.sum(p * np.log(p)))


# Keep only the first 8888 rows of the (already sorted) table.  ``.copy()``
# detaches the truncated frame from the full one, so adding a column below
# does not write into a slice and trigger pandas' SettingWithCopyWarning.
voc = voc.head(8888).copy()
# Represent each entry by the lowercase hex string of its UTF-8 bytes.
voc["hex"] = [word.encode("utf-8").hex() for word in voc["voc"]]
# From here on ``voc`` is a plain dict mapping hex string -> count.
voc = dict(zip(voc["hex"].tolist(), voc["count"].tolist()))

# Make sure every hex string of length 1, 2 and 3 is a key of ``voc``.
# Strings that are already present keep their count; missing ones get a
# count of 1.  The previous version stored every result under the one-digit
# key, so the 2- and 3-digit strings were never added and the one-digit
# counts were overwritten.
hex_digits = "0123456789abcdef"
for d1 in hex_digits:
    voc.setdefault(d1, 1)
    for d2 in hex_digits:
        voc.setdefault(d1 + d2, 1)
        for d3 in hex_digits:
            voc.setdefault(d1 + d2 + d3, 1)
# Turn the vocabulary into an ordered list of tokens (dict insertion order).
voc = list(voc)
for idx, hex_token in enumerate(voc):
    try:
        word = bytes.fromhex(hex_token).decode("utf-8")
    except ValueError:
        # Odd-length hex strings, and byte sequences that are not valid
        # UTF-8 (UnicodeDecodeError is a ValueError), stay in hex form.
        continue
    voc[idx] = word
    # ``voc1`` is a mapping, so it has no ``remove``; the old call raised
    # AttributeError on every iteration and a bare ``except`` hid it.
    # Drop the selected word from the remaining vocabulary if it is there.
    voc1.pop(word, None)
pd.to_pickle({"voc": voc, "voc1": voc1}, "total_voc_new.pkl")